started: Alexey Larionov, 01Mar2016
last updated: Alexey Larionov, 23Apr2019
Remove genotypes (set to NA) if:
1) genotypes gq < 20 (~12%)
2) genotypes dp < 10 (~18%, +5% after genotypes)
3) genotypes dp > 1000 (~1%)
4) Alt fraction
Remove variants with
1) call_rate < 0.85
(in each: nfe and wecare after the genotypes filtering)
this step removes ~73% of all variants !!!
2) uniform genotypes accross all samples
(created by the above filtering)
In addition
1) rename gt.mx to gt.mx
2) remove unnecessary matrices (gt_num, meta, gq, ref, alt); keep dp and gt_chr for later use
gq 20 filter is set arbitrary; however, it is consistent with what some othersis do
(e.g. see Carson BMC Bioinformatics. 2014 15:125).
A small number of genotypes (~1%) was covered too high to be true (above 1000 coverage).
These are obvious mistakes, and they have been removed too. Arbitrarily the threshold for
max DP was set to 1000 (appr. 10 fold of average coverage).
gt NA rate before filtering ~5% (~3% in wecare and ~12% in NFE respectively)
gt NA rate after filtering ~4% (~3% in wecare and ~6% in NFE respectively)
Input data: 8,275 vars x 739 cases (541 wecare + 198 nfe)
Output data: 1.886 vars x 739 cases (541 wecare + 198 nfe)
# Time stamp
Sys.time()
## [1] "2019-04-23 15:00:36 BST"
# Clenan-up
rm(list=ls())
graphics.off()
# Base folder
library(knitr)
base_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s07_filter_genotypes_and_variants"
opts_knit$set(root.dir = base_folder)
options(stringsAsFactors = F,
warnPartialMatchArgs = T,
warnPartialMatchAttr = T,
warnPartialMatchDollar = T)
#setwd(base_folder)
# Thresholds for genotypes
min.gq <- 20
min.dp <- 10
max.dp <- 1000
min.call.rate <- 0.85
# add parameters to filter by Alt fraction here?
load(paste(base_folder, "r01_filter_variants.RData", sep="/"))
gt.mx <- gt_add.mx
rm(gt_add.mx, gt_num.mx, meta.df) # keep gt_chr.mx for TsTv ratio check later
# List objects
ls()
## [1] "alt.mx" "base_folder" "dp.mx" "fixed.df" "gq.mx" "gt_chr.mx" "gt.mx" "max.dp" "min.call.rate" "min.dp" "min.gq" "ref.mx"
# Check sizes
dim(gt.mx)
## [1] 8275 739
dim(ref.mx)
## [1] 8275 739
dim(alt.mx)
## [1] 8275 739
dim(gq.mx)
## [1] 8275 739
dim(dp.mx)
## [1] 8275 739
dim(fixed.df)
## [1] 8275 110
colnames(fixed.df)
## [1] "CHROM" "POS" "ID" "REF" "ALT" "QUAL" "FILTER" "init_AC" "init_AF" "init_AN" "AS_FilterStatus" "AS_VQSLOD" "AS_culprit" "BaseQRankSum" "ClippingRankSum" "DB" "DP" "DS" "END" "ExcessHet" "FS" "HaplotypeScore" "InbreedingCoeff" "MLEAC" "MLEAF" "MQ" "MQRankSum" "NDA" "NEGATIVE_TRAIN_SITE" "POSITIVE_TRAIN_SITE" "QD" "ReadPosRankSum" "SOR" "SplitVarID" "VQSLOD" "culprit" "exac_non_TCGA.AC" "exac_non_TCGA.AC_AFR"
## [39] "exac_non_TCGA.AC_AMR" "exac_non_TCGA.AC_Adj" "exac_non_TCGA.AC_EAS" "exac_non_TCGA.AC_FEMALE" "exac_non_TCGA.AC_FIN" "exac_non_TCGA.AC_Hemi" "exac_non_TCGA.AC_Het" "exac_non_TCGA.AC_Hom" "exac_non_TCGA.AC_MALE" "exac_non_TCGA.AC_NFE" "exac_non_TCGA.AC_OTH" "exac_non_TCGA.AC_SAS" "exac_non_TCGA.AF" "exac_non_TCGA.AN" "exac_non_TCGA.AN_AFR" "exac_non_TCGA.AN_AMR" "exac_non_TCGA.AN_Adj" "exac_non_TCGA.AN_EAS" "exac_non_TCGA.AN_FEMALE" "exac_non_TCGA.AN_FIN" "exac_non_TCGA.AN_MALE" "exac_non_TCGA.AN_NFE" "exac_non_TCGA.AN_OTH" "exac_non_TCGA.AN_SAS" "exac_non_TCGA.Hemi_AFR" "exac_non_TCGA.Hemi_AMR" "exac_non_TCGA.Hemi_EAS" "exac_non_TCGA.Hemi_FIN" "exac_non_TCGA.Hemi_NFE" "exac_non_TCGA.Hemi_OTH" "exac_non_TCGA.Hemi_SAS" "exac_non_TCGA.Het_AFR" "exac_non_TCGA.Het_AMR" "exac_non_TCGA.Het_EAS" "exac_non_TCGA.Het_FIN" "exac_non_TCGA.Het_NFE" "exac_non_TCGA.Het_OTH" "exac_non_TCGA.Het_SAS"
## [77] "exac_non_TCGA.Hom_AFR" "exac_non_TCGA.Hom_AMR" "exac_non_TCGA.Hom_EAS" "exac_non_TCGA.Hom_FIN" "exac_non_TCGA.Hom_NFE" "exac_non_TCGA.Hom_OTH" "exac_non_TCGA.Hom_SAS" "kgen.AC" "kgen.AF" "kgen.AFR_AF" "kgen.AMR_AF" "kgen.AN" "kgen.EAS_AF" "kgen.EUR_AF" "kgen.SAS_AF" "SYMBOL" "Allele" "Existing_variation" "Consequence" "IMPACT" "CLIN_SIG" "cDNA_position" "CDS_position" "Codons" "Protein_position" "Amino_acids" "DISTANCE" "STRAND" "SYMBOL_SOURCE" "Multiallelic" "SIFT_call" "SIFT_score" "PolyPhen_call" "PolyPhen_score"
# Check consistence of rownames and colnames
sum(rownames(gt.mx) != rownames(gq.mx))
## [1] 0
sum(rownames(gt.mx) != rownames(dp.mx))
## [1] 0
sum(rownames(gt.mx) != rownames(ref.mx))
## [1] 0
sum(rownames(gt.mx) != rownames(alt.mx))
## [1] 0
sum(colnames(gt.mx) != colnames(gq.mx))
## [1] 0
sum(colnames(gt.mx) != colnames(dp.mx))
## [1] 0
sum(colnames(gt.mx) != colnames(ref.mx))
## [1] 0
sum(colnames(gt.mx) != colnames(alt.mx))
## [1] 0
sum(rownames(gt.mx) != rownames(fixed.df))
## [1] 0
Genotypes NA rates
Histogram of call rates per variant
Histograms of gq and dp in non-NA genotypes
gt.mx[1:3,c(1,2,541,542,738,739)]
## 100_S8_L007 101_S528_L008 9_S346_L008 HG00097 NA20828 NA20832
## Var000000001 0 0 0 0 0 0
## Var000000002 0 0 0 0 0 0
## Var000000003 0 0 0 0 0 0
# Prepare matrices with sub-groups data
gt_nfe.mx <- gt.mx[,542:739]
gq_nfe.mx <- gq.mx[,542:739]
dp_nfe.mx <- dp.mx[,542:739]
gt_wecare.mx <- gt.mx[,1:541]
gq_wecare.mx <- gq.mx[,1:541]
dp_wecare.mx <- dp.mx[,1:541]
gt_nfe.mx[1:5,1:5]
## HG00097 HG00099 HG00100 HG00102 HG00106
## Var000000001 0 0 0 0 0
## Var000000002 0 0 0 0 0
## Var000000003 0 0 0 1 0
## Var000000004 1 2 1 1 2
## Var000000005 1 2 1 1 2
gt_wecare.mx[1:5,1:5]
## 100_S8_L007 101_S528_L008 102_S466_L008 103_S147_L007 104_S325_L008
## Var000000001 0 0 0 0 0
## Var000000002 0 0 0 0 0
## Var000000003 0 0 0 0 0
## Var000000004 0 2 2 2 1
## Var000000005 0 2 2 2 1
# --- Fractions of NA genotypes before filtering --- #
sum(is.na(gt.mx))/(nrow(gt.mx)*ncol(gt.mx)) # ~5%
## [1] 0.05399147
sum(is.na(gt_nfe.mx))/(nrow(gt_nfe.mx)*ncol(gt_nfe.mx)) # ~12%
## [1] 0.1254826
sum(is.na(gt_wecare.mx))/(nrow(gt_wecare.mx)*ncol(gt_wecare.mx)) # ~3%
## [1] 0.0278265
# --- Call rates per variant before filtering --- #
# All samples
x <- ncol(gt.mx)
y <- apply(gt.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant before filtering\n739 wecare-nfe samples", xlab="Call rates")
# nfe
x <- ncol(gt_nfe.mx)
y <- apply(gt_nfe.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant before filtering\n198 nfe samples", xlab="Call rates")
# wecare
x <- ncol(gt_wecare.mx)
y <- apply(gt_wecare.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant before filtering\n541 wecare samples", xlab="Call rates")
# --- gq before filtering (when gt is not NA !) --- #
# All samples
hist(gq.mx[!is.na(gt.mx)], breaks=50,
main="Histogram of gq in non-NA genotypes before filtering\n739 wecare-nfe samples", xlab="gq")
hist(gq_nfe.mx[!is.na(gt_nfe.mx)], breaks=50,
main="Histogram of gq in non-NA genotypes before filtering\n198 nfe samples", xlab="gq")
hist(gq_wecare.mx[!is.na(gt_wecare.mx)], breaks=50,
main="Histogram of gq in non-NA genotypes before filtering\n541 wecare samples", xlab="gq")
# --- dp before filtering (when gt is not NA !) --- #
# All samples
hist(dp.mx[!is.na(gt.mx)], breaks=50,
main="Histogram of dp in non-NA genotypes before filtering\n739 wecare-nfe samples", xlab="dp")
hist(dp.mx[!is.na(gt.mx)], breaks=2500, xlim=c(0,100),
main="Histogram of dp in non-NA genotypes before filtering (zoom, 0:100)\n739 wecare-nfe samples", xlab="dp")
# nfe
hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=50,
main="Histogram of dp in non-NA genotypes before filtering\n198 nfe samples", xlab="dp")
hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=2500, xlim=c(0,100),
main="Histogram of dp in non-NA genotypes before filtering (zoom, 0:100)\n198 nfe samples", xlab="dp")
# wecare
hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=50,
main="Histogram of dp in non-NA genotypes before filtering\n541 wecare samples", xlab="dp")
hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=2500, xlim=c(0,100),
main="Histogram of dp in non-NA genotypes before filtering (zoom, 0:100)\n541 wecare samples", xlab="dp")
# Mean GQ and DP in non-NA genotypes before filtering
quantile(gq.mx[!is.na(gt.mx)], na.rm=TRUE) # median gq ~ 99
## 0% 25% 50% 75% 100%
## 0 45 99 99 99
quantile(dp.mx[!is.na(gt.mx)], na.rm=TRUE) # median dp ~ 35
## 0% 25% 50% 75% 100%
## 0 16 35 53 9901
mean(dp.mx[!is.na(gt.mx)], na.rm=TRUE) # mean dp ~89
## [1] 89.25717
# crude estimates for proportions of genotypes removed by filters
sum(dp.mx > max.dp, na.rm = T) / sum(!is.na(dp.mx))
## [1] 0.0122493
sum(dp.mx < min.dp, na.rm = T) / sum(!is.na(dp.mx))
## [1] 0.1797405
sum(gq.mx < min.gq, na.rm = T) / sum(!is.na(gq.mx))
## [1] 0.1190502
# Clean-up
rm(x,y, gt_nfe.mx, gq_nfe.mx, dp_nfe.mx, gt_wecare.mx, gq_wecare.mx, dp_wecare.mx)
Put NA to genotypes where gq < 20 : removes ~12% of non-NA genotypes
# num of genotypes to be removed (in !NA gt)
format(sum(gq.mx[!is.na(gt.mx)] < min.gq, na.rm=TRUE), big.mark=",") # ~687K
## [1] "686,697"
# Fraction of non-NA genotypes to be removed (in !NA gt)
sum(gq.mx[!is.na(gt.mx)] < min.gq, na.rm=TRUE)/sum(!is.na(gt.mx)) # ~12%
## [1] 0.1187019
# Apply filter (to gt only !)
NA -> gt.mx[ gq.mx < min.gq ]
NA -> gt_chr.mx[ gq.mx < min.gq ]
# Clean up
rm(min.gq)
# Prepare matrices with sub-groups data
gt_nfe.mx <- gt.mx[,542:739]
gq_nfe.mx <- gq.mx[,542:739]
dp_nfe.mx <- dp.mx[,542:739]
gt_wecare.mx <- gt.mx[,1:541]
gq_wecare.mx <- gq.mx[,1:541]
dp_wecare.mx <- dp.mx[,1:541]
# --- Fractions of NA genotypes after gq filtering --- #
sum(is.na(gt.mx))/(nrow(gt.mx)*ncol(gt.mx)) # ~17%
## [1] 0.1662845
sum(is.na(gt_nfe.mx))/(nrow(gt_nfe.mx)*ncol(gt_nfe.mx)) # ~22%
## [1] 0.225728
sum(is.na(gt_wecare.mx))/(nrow(gt_wecare.mx)*ncol(gt_wecare.mx)) # ~14%
## [1] 0.1445288
# --- Call rates per variant after gq filtering --- #
# All samples
x <- ncol(gt.mx)
y <- apply(gt.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq filtering\n739 wecare-nfe samples", xlab="Call rates")
# nfe
x <- ncol(gt_nfe.mx)
y <- apply(gt_nfe.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq filtering\n198 nfe samples", xlab="Call rates")
# wecare
x <- ncol(gt_wecare.mx)
y <- apply(gt_wecare.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq filtering\n541 wecare samples", xlab="Call rates")
# --- gq after gq filtering (when gt is not NA !) --- #
# All samples
hist(gq.mx[!is.na(gt.mx)], breaks=50, xlim=c(0,100),
main="Histogram of gq in non-NA genotypes after gq filtering\n739 wecare-nfe samples", xlab="gq")
hist(gq_nfe.mx[!is.na(gt_nfe.mx)], breaks=50, xlim=c(0,100),
main="Histogram of gq in non-NA genotypes after gq filtering\n198 nfe samples", xlab="gq")
hist(gq_wecare.mx[!is.na(gt_wecare.mx)], breaks=50, xlim=c(0,100),
main="Histogram of gq in non-NA genotypes after gq filtering\n541 wecare samples", xlab="gq")
# --- dp after gq filtering (when gt is not NA !) --- #
# All samples
hist(dp.mx[!is.na(gt.mx)], breaks=50,
main="Histogram of dp in non-NA genotypes after gq filtering\n739 wecare-nfe samples", xlab="dp")
hist(dp.mx[!is.na(gt.mx)], breaks=2500, xlim=c(0,100),
main="Histogram of dp in non-NA genotypes after gq filtering (zoom 0:100)\n739 wecare-nfe samples", xlab="dp")
# nfe
hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=50,
main="Histogram of dp in non-NA genotypes after gq filtering\n198 nfe samples", xlab="dp")
hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=2500, xlim=c(0,100),
main="Histogram of dp in non-NA genotypes after gq filtering (zoom 0:100)\n198 nfe samples", xlab="dp")
# wecare
hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=50,
main="Histogram of dp in non-NA genotypes after gq filtering\n541 wecare samples", xlab="dp")
hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=2500, xlim=c(0,100),
main="Histogram of dp in non-NA genotypes after gq filtering (zoom 0:100)\n541 wecare samples", xlab="dp")
# Mean GQ and DP in non-NA genotypes after gq filtering
mean(gq.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 81
## [1] 81.69107
mean(dp.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 100
## [1] 100.3238
# Clean-up
rm(x,y, gt_nfe.mx, gq_nfe.mx, dp_nfe.mx, gt_wecare.mx, gq_wecare.mx, dp_wecare.mx)
put NA to genotypes where dp < 10 : additionally removes ~5% of non-NA genotypes
# num of genotypes to be removed (in !NA gt after gq filtering)
format(sum(dp.mx[!is.na(gq.mx)] < min.dp, na.rm=TRUE), big.mark=",") # ~871K
## [1] "871,268"
# Fraction of genotypes to be removed (in !NA gt after gq filtering)
sum(dp.mx[!is.na(gq.mx)] < min.dp, na.rm=TRUE)/sum(!is.na(gt.mx)) # ~17%
## [1] 0.1708919
# Apply filter (to gt only !)
NA -> gt.mx[ dp.mx < min.dp ]
NA -> gt_chr.mx[ dp.mx < min.dp ]
# Clean up
rm(min.dp)
# Prepare matrices with sub-groups data
gt_nfe.mx <- gt.mx[,542:739]
gq_nfe.mx <- gq.mx[,542:739]
dp_nfe.mx <- dp.mx[,542:739]
gt_wecare.mx <- gt.mx[,1:541]
gq_wecare.mx <- gq.mx[,1:541]
dp_wecare.mx <- dp.mx[,1:541]
# --- Fractions of NA genotypes after gq filtering --- #
sum(is.na(gt.mx))/(nrow(gt.mx)*ncol(gt.mx)) # ~21%
## [1] 0.2075736
sum(is.na(gt_nfe.mx))/(nrow(gt_nfe.mx)*ncol(gt_nfe.mx)) # ~25%
## [1] 0.2469261
sum(is.na(gt_wecare.mx))/(nrow(gt_wecare.mx)*ncol(gt_wecare.mx)) # ~19%
## [1] 0.193171
# --- Call rates per variant after gq-dp filtering --- #
# All samples
x <- ncol(gt.mx)
y <- apply(gt.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq-low-dp filtering\n739 wecare-nfe samples", xlab="Call rates")
# nfe
x <- ncol(gt_nfe.mx)
y <- apply(gt_nfe.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq-low-dp filtering\n198 nfe samples", xlab="Call rates")
# wecare
x <- ncol(gt_wecare.mx)
y <- apply(gt_wecare.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq-low-dp filtering\n541 wecare samples", xlab="Call rates")
# --- gq after gq-dp filtering (when gt is not NA !) --- #
# All samples
hist(gq.mx[!is.na(gt.mx)], breaks=50, xlim=c(0,100),
main="Histogram of gq in non-NA genotypes after gq-low-dp filtering\n739 wecare-nfe samples", xlab="gq")
hist(gq_nfe.mx[!is.na(gt_nfe.mx)], breaks=50, xlim=c(0,100),
main="Histogram of gq in non-NA genotypes after gq-low-dp filtering\n198 nfe samples", xlab="gq")
hist(gq_wecare.mx[!is.na(gt_wecare.mx)], breaks=50, xlim=c(0,100),
main="Histogram of gq in non-NA genotypes after gq-low-dp filtering\n541 wecare samples", xlab="gq")
# --- dp after gq-dp filtering (when gt is not NA !) --- #
# All samples
hist(dp.mx[!is.na(gt.mx)], breaks=50,
main="Histogram of dp in non-NA genotypes after gq-low-dp filtering\n739 wecare-nfe samples", xlab="dp")
hist(dp.mx[!is.na(gt.mx)], breaks=2500, xlim=c(0,100),
main="Histogram of dp in non-NA genotypes after gq-low-dp filtering (zoom, 0:100)\n739 wecare-nfe samples", xlab="dp")
# nfe
hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=50,
main="Histogram of dp in non-NA genotypes after gq-low-dp filtering\n198 nfe samples", xlab="dp")
hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=250, xlim=c(0,100),
main="Histogram of dp in non-NA genotypes after gq-low-dp filtering (zoom, 0:100)\n198 nfe samples", xlab="dp")
# wecare
hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=50,
main="Histogram of dp in non-NA genotypes after gq-low-dp filtering\n541 wecare samples", xlab="dp")
hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=2500, xlim=c(0,100),
main="Histogram of dp in non-NA genotypes after gq-low-dp filtering (zoom, 0:100)\n541 wecare samples", xlab="dp")
# Mean GQ and DP in non-NA genotypes after gq-dp filtering
mean(gq.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 85
## [1] 84.50231
mean(dp.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 105
## [1] 105.1593
# Clean-up
rm(x,y, gt_nfe.mx, gq_nfe.mx, dp_nfe.mx, gt_wecare.mx, gq_wecare.mx, dp_wecare.mx)
put NA to genotypes where dp > 1000 : removes <<1% of non-NA genotypes
# num of genotypes to be removed (in !NA gt after previous filtering)
format(sum(dp.mx[!is.na(gq.mx)] > max.dp, na.rm=TRUE), big.mark=",") # ~73K
## [1] "72,630"
# Fraction of genotypes to be removed (in !NA gt after previous filtering)
sum(dp.mx[!is.na(gq.mx)] > max.dp, na.rm=TRUE)/sum(!is.na(gt.mx)) # ~1%
## [1] 0.01498803
# Apply filter (to gt only !)
NA -> gt.mx[ dp.mx > max.dp ]
NA -> gt_chr.mx[ dp.mx > max.dp ]
# Clean up
rm(max.dp)
# Prepare matrices with sub-groups data
gt_nfe.mx <- gt.mx[,542:739]
gq_nfe.mx <- gq.mx[,542:739]
dp_nfe.mx <- dp.mx[,542:739]
gt_wecare.mx <- gt.mx[,1:541]
gq_wecare.mx <- gq.mx[,1:541]
dp_wecare.mx <- dp.mx[,1:541]
# --- Fractions of NA genotypes after gq-dp filtering --- #
sum(is.na(gt.mx))/(nrow(gt.mx)*ncol(gt.mx)) # ~22%
## [1] 0.2194281
sum(is.na(gt_nfe.mx))/(nrow(gt_nfe.mx)*ncol(gt_nfe.mx)) # ~25%
## [1] 0.2469261
sum(is.na(gt_wecare.mx))/(nrow(gt_wecare.mx)*ncol(gt_wecare.mx)) # ~20%
## [1] 0.2093641
# --- Call rates per variant after gq-dp filtering --- #
# All samples
x <- ncol(gt.mx)
y <- apply(gt.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq-dp filtering\n739 wecare-nfe samples", xlab="Call rates")
# nfe
x <- ncol(gt_nfe.mx)
y <- apply(gt_nfe.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq-dp filtering\n198 nfe samples", xlab="Call rates")
# wecare
x <- ncol(gt_wecare.mx)
y <- apply(gt_wecare.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq-dp filtering\n541 wecare samples", xlab="Call rates")
# --- gq after gq-dp filtering (when gt is not NA !) --- #
# All samples
hist(gq.mx[!is.na(gt.mx)], breaks=50, xlim=c(0,100),
main="Histogram of gq in non-NA genotypes after gq-dp filtering\n739 wecare-nfe samples", xlab="gq")
hist(gq_nfe.mx[!is.na(gt_nfe.mx)], breaks=50, xlim=c(0,100),
main="Histogram of gq in non-NA genotypes after gq-dp filtering\n198 nfe samples", xlab="gq")
hist(gq_wecare.mx[!is.na(gt_wecare.mx)], breaks=50, xlim=c(0,100),
main="Histogram of gq in non-NA genotypes after gq-dp filtering\n541 wecare samples", xlab="gq")
# --- dp after gq-dp filtering (when gt is not NA !) --- #
# All samples
hist(dp.mx[!is.na(gt.mx)], breaks=50,
main="Histogram of dp in non-NA genotypes after gq-dp filtering\n739 wecare-nfe samples", xlab="dp")
hist(dp.mx[!is.na(gt.mx)], breaks=250, xlim=c(0,100),
main="Histogram of dp in non-NA genotypes after gq-dp filtering (zoom, 0:100)\n739 wecare-nfe samples", xlab="dp")
# nfe
hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=50,
main="Histogram of dp in non-NA genotypes after gq-dp filtering\n198 nfe samples", xlab="dp")
hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=250, xlim=c(0,100),
main="Histogram of dp in non-NA genotypes after gq-dp filtering (zoom, 0:100)\n198 nfe samples", xlab="dp")
# wecare
hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=50,
main="Histogram of dp in non-NA genotypes after gq-dp filtering\n541 wecare samples", xlab="dp")
hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=250, xlim=c(0,100),
main="Histogram of dp in non-NA genotypes after gq-dp filtering (zoom, 0:100)\n541 wecare samples", xlab="dp")
# Mean GQ and DP in non-NA genotypes after gq-dp filtering
mean(gq.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 84
## [1] 84.28232
mean(dp.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 73
## [1] 73.08944
# Clean-up
rm(x,y, gt_nfe.mx, gq_nfe.mx, dp_nfe.mx, gt_wecare.mx, gq_wecare.mx, dp_wecare.mx)
# Prepare alt_fraction.mx
alt_fraction.mx <- alt.mx / (alt.mx + ref.mx)
dim(alt_fraction.mx)
## [1] 8275 739
sum(is.na(alt_fraction.mx))
## [1] 336781
sum(is.na(gt.mx))
## [1] 1341852
# Only consider alt_fraction, where genotypes are not NA
NA -> alt_fraction.mx[is.na(gt.mx)]
sum(is.na(alt_fraction.mx))
## [1] 1348217
sum(is.na(gt.mx))
## [1] 1341852
# Explore alt_fraction in hom.ref
alt_fraction_hom_ref.mx <- alt_fraction.mx
NA -> alt_fraction_hom_ref.mx[gt.mx != 0]
sum(!is.na(alt_fraction_hom_ref.mx))
## [1] 4530694
hist(alt_fraction_hom_ref.mx, xlim=c(0,1))
sum(alt_fraction_hom_ref.mx > 0.2, na.rm = T)
## [1] 81
# Explore alt_fraction in het
alt_fraction_het.mx <- alt_fraction.mx
NA -> alt_fraction_het.mx[gt.mx != 1]
sum(!is.na(alt_fraction_het.mx))
## [1] 154851
hist(alt_fraction_het.mx, xlim=c(0,1))
sum(alt_fraction_het.mx < 0.2, na.rm = T)
## [1] 5988
sum(alt_fraction_het.mx > 0.8, na.rm = T)
## [1] 1341
# Explore alt_fraction in hom.alt
alt_fraction_hom_alt.mx <- alt_fraction.mx
NA -> alt_fraction_hom_alt.mx[gt.mx != 2]
sum(!is.na(alt_fraction_hom_alt.mx))
## [1] 81463
hist(alt_fraction_hom_alt.mx, xlim=c(0,1))
sum(alt_fraction_hom_alt.mx < 0.2, na.rm = T)
## [1] 31
# Clean-up (keep matrices needed in teh next chunk!)
rm(alt_fraction.mx)
NA -> gt.mx[alt_fraction_hom_ref.mx > 0.2]
NA -> gt_chr.mx[alt_fraction_hom_ref.mx > 0.2]
NA -> gt.mx[alt_fraction_het.mx > 0.8]
NA -> gt_chr.mx[alt_fraction_het.mx > 0.8]
NA -> gt.mx[alt_fraction_het.mx < 0.2]
NA -> gt_chr.mx[alt_fraction_het.mx < 0.2]
NA -> gt.mx[alt_fraction_hom_alt.mx < 0.8]
NA -> gt_chr.mx[alt_fraction_hom_alt.mx < 0.8]
rm(alt_fraction_hom_ref.mx, alt_fraction_het.mx, alt_fraction_hom_alt.mx)
# Prepare alt_fraction.mx
alt_fraction.mx <- alt.mx / (alt.mx + ref.mx)
dim(alt_fraction.mx)
## [1] 8275 739
sum(is.na(alt_fraction.mx))
## [1] 336781
sum(is.na(gt.mx))
## [1] 1349295
# Only consider alt_fraction, where genotypes are not NA
NA -> alt_fraction.mx[is.na(gt.mx)]
sum(is.na(alt_fraction.mx))
## [1] 1355660
sum(is.na(gt.mx))
## [1] 1349295
# Explore alt_fraction in hom.ref
alt_fraction_hom_ref.mx <- alt_fraction.mx
NA -> alt_fraction_hom_ref.mx[gt.mx != 0]
sum(!is.na(alt_fraction_hom_ref.mx))
## [1] 4530613
hist(alt_fraction_hom_ref.mx, xlim=c(0,1))
sum(alt_fraction_hom_ref.mx > 0.2, na.rm = T)
## [1] 0
# Explore alt_fraction in het
alt_fraction_het.mx <- alt_fraction.mx
NA -> alt_fraction_het.mx[gt.mx != 1]
sum(!is.na(alt_fraction_het.mx))
## [1] 147522
hist(alt_fraction_het.mx, xlim=c(0,1))
sum(alt_fraction_het.mx < 0.2, na.rm = T)
## [1] 0
sum(alt_fraction_het.mx > 0.8, na.rm = T)
## [1] 0
# Explore alt_fraction in hom.alt
alt_fraction_hom_alt.mx <- alt_fraction.mx
NA -> alt_fraction_hom_alt.mx[gt.mx != 2]
sum(!is.na(alt_fraction_hom_alt.mx))
## [1] 81430
hist(alt_fraction_hom_alt.mx, xlim=c(0,1))
sum(alt_fraction_hom_alt.mx < 0.2, na.rm = T)
## [1] 0
# Clean-up
rm(alt_fraction.mx, alt_fraction_hom_ref.mx, alt_fraction_het.mx,
alt_fraction_hom_alt.mx, ref.mx, alt.mx)
# Prepare matrices with sub-groups data
gt_nfe.mx <- gt.mx[,542:739]
gq_nfe.mx <- gq.mx[,542:739]
dp_nfe.mx <- dp.mx[,542:739]
gt_wecare.mx <- gt.mx[,1:541]
gq_wecare.mx <- gq.mx[,1:541]
dp_wecare.mx <- dp.mx[,1:541]
# --- Fractions of NA genotypes after gq-dp-af filtering --- #
sum(is.na(gt.mx))/(nrow(gt.mx)*ncol(gt.mx)) # ~22%
## [1] 0.2206452
sum(is.na(gt_nfe.mx))/(nrow(gt_nfe.mx)*ncol(gt_nfe.mx)) # ~25%
## [1] 0.2472721
sum(is.na(gt_wecare.mx))/(nrow(gt_wecare.mx)*ncol(gt_wecare.mx)) # ~21%
## [1] 0.2109
# --- Call rates per variant after gq-dp-af filtering --- #
# All samples
x <- ncol(gt.mx)
y <- apply(gt.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq-dp-af filtering\n739 wecare-nfe samples", xlab="Call rates")
# nfe
x <- ncol(gt_nfe.mx)
y <- apply(gt_nfe.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq-dp-af filtering\n198 nfe samples", xlab="Call rates")
# wecare
x <- ncol(gt_wecare.mx)
y <- apply(gt_wecare.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq-dp-af filtering\n541 wecare samples", xlab="Call rates")
# --- gq after gq-dp-af filtering (when gt is not NA !) --- #
# All samples
hist(gq.mx[!is.na(gt.mx)], breaks=50, xlim=c(0,100),
main="Histogram of gq in non-NA genotypes after gq-dp-af filtering\n739 wecare-nfe samples", xlab="gq")
hist(gq_nfe.mx[!is.na(gt_nfe.mx)], breaks=50, xlim=c(0,100),
main="Histogram of gq in non-NA genotypes after gq-dp-af filtering\n198 nfe samples", xlab="gq")
hist(gq_wecare.mx[!is.na(gt_wecare.mx)], breaks=50, xlim=c(0,100),
main="Histogram of gq in non-NA genotypes after gq-dp-af filtering\n541 wecare samples", xlab="gq")
# --- dp after gq-dp-af filtering (when gt is not NA !) --- #
# All samples
hist(dp.mx[!is.na(gt.mx)], breaks=50,
main="Histogram of dp in non-NA genotypes after gq-dp-af filtering\n739 wecare-nfe samples", xlab="dp")
hist(dp.mx[!is.na(gt.mx)], breaks=250, xlim=c(0,100),
main="Histogram of dp in non-NA genotypes after gq-dp-af filtering (zoom, 0:100)\n739 wecare-nfe samples", xlab="dp")
# nfe
hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=50,
main="Histogram of dp in non-NA genotypes after gq-dp-af filtering\n198 nfe samples", xlab="dp")
hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=250, xlim=c(0,100),
main="Histogram of dp in non-NA genotypes after gq-dp-af filtering (zoom, 0:100)\n198 nfe samples", xlab="dp")
# wecare
hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=50,
main="Histogram of dp in non-NA genotypes after gq-dp-af filtering\n541 wecare samples", xlab="dp")
hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=250, xlim=c(0,100),
main="Histogram of dp in non-NA genotypes after gq-dp-af filtering (zoom, 0:100)\n541 wecare samples", xlab="dp")
# Mean GQ and DP in non-NA genotypes after gq-dp-af filtering
mean(gq.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 84
## [1] 84.31422
mean(dp.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 73
## [1] 73.09395
# Clean-up
rm(x,y, gt_nfe.mx, gq_nfe.mx, dp_nfe.mx, gt_wecare.mx, gq_wecare.mx, dp_wecare.mx)
Remove variants with call rate < 85% in either nfe or wecare after the genotypes filtering:
removes 6,065 (~73%) variants (8,275 -> 2,210)
Most of the variants were removed because of low call rate in NFE
dim(gt.mx)
## [1] 8275 739
# Estimate callrates in WECARE samples
wecare_cases <- c(rep(T,541),rep(F,198))
length(wecare_cases)
## [1] 739
sum(wecare_cases)
## [1] 541
x_wecare <- ncol(gt.mx[,wecare_cases])
y_wecare <- apply(gt.mx[,wecare_cases], 1, function(z){1-sum(is.na(z))/x_wecare})
length(y_wecare)
## [1] 8275
y_wecare[1:7]
## Var000000001 Var000000002 Var000000003 Var000000004 Var000000005 Var000000006 Var000000010
## 0.9889094 0.9889094 0.9426987 0.7338262 0.5730129 0.7707948 0.9020333
var_wecare.retained <- y_wecare >= min.call.rate
sum(var_wecare.retained) # 4,916 to be retained
## [1] 4916
1 - sum(var_wecare.retained)/nrow(gt.mx[,wecare_cases]) # ~40% to be removed
## [1] 0.4059215
# Estimate callrates in NFE samples
nfe_cases <- !wecare_cases
sum(nfe_cases)
## [1] 198
x_nfe <- ncol(gt.mx[,nfe_cases])
y_nfe <- apply(gt.mx[,nfe_cases], 1, function(z){1-sum(is.na(z))/x_nfe})
length(y_nfe)
## [1] 8275
y_nfe[1:7]
## Var000000001 Var000000002 Var000000003 Var000000004 Var000000005 Var000000006 Var000000010
## 0.3636364 0.8282828 0.8585859 0.9090909 0.9090909 0.9141414 0.5151515
var_nfe.retained <- y_nfe >= min.call.rate
sum(var_nfe.retained) # 3,565 to be retained
## [1] 3565
1 - sum(var_nfe.retained)/nrow(gt.mx[,nfe_cases]) # ~57% to be removed
## [1] 0.5691843
# Estimate the proportion of variants to be removed
var.retained <- var_wecare.retained & var_nfe.retained
sum(var.retained) # 2,210 to be retained
## [1] 2210
1 - sum(var.retained)/nrow(gt.mx) # ~73% to be removed
## [1] 0.7329305
# Remove variants with loaw call rates
gt.mx <- gt.mx[ var.retained, ]
gt_chr.mx <- gt_chr.mx[ var.retained, ]
dp.mx <- dp.mx[ var.retained, ]
gq.mx <- gq.mx[ var.retained, ]
fixed.df <- fixed.df[ var.retained, ]
# Clean-up
rm(min.call.rate, var.retained, x_wecare, y_wecare, x_nfe, y_nfe,
wecare_cases, nfe_cases, var_wecare.retained, var_nfe.retained)
Remove 324 variants: 2,210 -> 1,886
In some variants the diverse genotypes were removed during the above filtering
# Check that there is no all-NA variants
non_NA_count.udf <- function(x){sum(!is.na(x))}
all_NA <- apply(gt.mx, 1, non_NA_count.udf) == 0
sum(all_NA) # 0
## [1] 0
# Function to detect uniform numeric vector
uniform_vector.udf <- function(x){
if(min(x, na.rm=TRUE) == max(x, na.rm=TRUE)){return(TRUE)} else {return(FALSE)}}
# Variants with uniform genotypes accross all samples
uniform_genotypes <- apply(gt.mx, 1, uniform_vector.udf)
summary(uniform_genotypes)
## Mode FALSE TRUE
## logical 1886 324
sum(uniform_genotypes) # 324
## [1] 324
# Remove variants with uniform genotypes accross all samples
gt.mx <- gt.mx[!uniform_genotypes,]
gt_chr.mx <- gt_chr.mx[!uniform_genotypes,]
gq.mx <- gq.mx[!uniform_genotypes,]
dp.mx <- dp.mx[!uniform_genotypes,]
fixed.df <- fixed.df[!uniform_genotypes,]
dim(gt.mx)
## [1] 1886 739
dim(gt_chr.mx)
## [1] 1886 739
dim(gq.mx)
## [1] 1886 739
dim(dp.mx)
## [1] 1886 739
dim(fixed.df)
## [1] 1886 110
# Clean-up
rm(non_NA_count.udf, all_NA, uniform_vector.udf, uniform_genotypes)
Genotypes NA rates
Histogram of call rates per variant
Histograms of gq and dp in non-NA genotypes
# Prepare matrices with sub-groups data
gt_nfe.mx <- gt.mx[,542:739]
gq_nfe.mx <- gq.mx[,542:739]
dp_nfe.mx <- dp.mx[,542:739]
gt_wecare.mx <- gt.mx[,1:541]
gq_wecare.mx <- gq.mx[,1:541]
dp_wecare.mx <- dp.mx[,1:541]
# --- Fractions of NA genotypes after filtering --- #
sum(is.na(gt.mx))/(nrow(gt.mx)*ncol(gt.mx)) # ~4.1%
## [1] 0.04149369
sum(is.na(gt_nfe.mx))/(nrow(gt_nfe.mx)*ncol(gt_nfe.mx)) # ~6.2%
## [1] 0.06234937
sum(is.na(gt_wecare.mx))/(nrow(gt_wecare.mx)*ncol(gt_wecare.mx)) # ~3.4%
## [1] 0.03386075
# --- Call rates per variant after filtering --- #
# All samples
x <- ncol(gt.mx)
y <- apply(gt.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, main="Call rates per variant after filtering\n739 wecare-nfe samples",
xlim=c(0,1), xlab="Call rates")
# nfe
x <- ncol(gt_nfe.mx)
y <- apply(gt_nfe.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, main="Call rates per variant after genotypes filtering\n198 nfe samples",
xlim=c(0,1), xlab="Call rates")
# wecare
x <- ncol(gt_wecare.mx)
y <- apply(gt_wecare.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, main="Call rates per variant after genotypes filtering\n541 wecare samples",
xlim=c(0,1), xlab="Call rates")
# --- gq after filtering (when gt is not NA !) --- #
# All samples
hist(gq.mx[!is.na(gt.mx)], breaks=50, xlim=c(0,100),
main="Histogram of gq in non-NA genotypes after filtering\n739 wecare-nfe samples", xlab="gq")
hist(gq_nfe.mx[!is.na(gt_nfe.mx)], breaks=50, xlim=c(0,100),
main="Histogram of gq in non-NA genotypes after filtering\n198 nfe samples", xlab="gq")
hist(gq_wecare.mx[!is.na(gt_wecare.mx)], breaks=50, xlim=c(0,100),
main="Histogram of gq in non-NA genotypes after filtering\n541 wecare samples", xlab="gq")
# --- dp after filtering (when gt is not NA !) --- #
# All samples
hist(dp.mx[!is.na(gt.mx)], breaks=50,
main="Histogram of dp in non-NA genotypes after filtering\n739 wecare-nfe samples", xlab="dp")
hist(dp.mx[!is.na(gt.mx)], breaks=250, xlim=c(0,100),
main="Histogram of dp in non-NA genotypes after filtering (zoom 0:100)\n739 wecare-nfe samples", xlab="dp")
# nfe
hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=50,
main="Histogram of dp in non-NA genotypes after filtering\n198 nfe samples", xlab="dp")
hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=250, xlim=c(0,100),
main="Histogram of dp in non-NA genotypes after filtering (zoom 0:100)\n198 nfe samples", xlab="dp")
# wecare
hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=50,
main="Histogram of dp in non-NA genotypes after filtering\n541 wecare samples", xlab="dp")
hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=250, xlim=c(0,100),
main="Histogram of dp in non-NA genotypes after filtering (zoom 0:100)\n541 wecare samples", xlab="dp")
# Mean GQ and DP in non-NA genotypes after filtering
mean(gq.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 92
## [1] 92.38478
mean(dp.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 76
## [1] 75.92664
quantile(gq.mx[!is.na(gt.mx)], na.rm=TRUE)
## 0% 25% 50% 75% 100%
## 20 99 99 99 99
quantile(dp.mx[!is.na(gt.mx)], na.rm=TRUE)
## 0% 25% 50% 75% 100%
## 10 34 42 70 1000
# Clean-up (keep dp for later comparison of samples)
rm(x,y, gt_nfe.mx, gq_nfe.mx, dp_nfe.mx, gt_wecare.mx, gq_wecare.mx, dp_wecare.mx, gq.mx)
dim(gt.mx)
## [1] 1886 739
class(gt.mx)
## [1] "matrix"
gt.mx[1:5,1:5]
## 100_S8_L007 101_S528_L008 102_S466_L008 103_S147_L007 104_S325_L008
## Var000000003 0 0 0 0 0
## Var000000048 1 2 2 2 2
## Var000000073 0 0 0 0 0
## Var000000095 0 0 0 0 0
## Var000000096 0 0 0 0 0
dim(gt_chr.mx)
## [1] 1886 739
class(gt_chr.mx)
## [1] "matrix"
gt_chr.mx[1:5,1:5]
## 100_S8_L007 101_S528_L008 102_S466_L008 103_S147_L007 104_S325_L008
## Var000000003 "C/C" "C/C" "C/C" "C/C" "C/C"
## Var000000048 "T/C" "C/C" "C/C" "C/C" "C/C"
## Var000000073 "C/C" "C/C" "C/C" "C/C" "C/C"
## Var000000095 "T/T" "T/T" "T/T" "T/T" "T/T"
## Var000000096 "T/T" "T/T" "T/T" "T/T" "T/T"
dim(dp.mx)
## [1] 1886 739
class(dp.mx)
## [1] "matrix"
dp.mx[1:5,1:5]
## 100_S8_L007 101_S528_L008 102_S466_L008 103_S147_L007 104_S325_L008
## Var000000003 304 166 267 146 411
## Var000000048 316 192 145 124 246
## Var000000073 159 46 186 42 62
## Var000000095 58 75 278 63 344
## Var000000096 58 75 278 63 344
dim(fixed.df)
## [1] 1886 110
str(fixed.df)
## 'data.frame': 1886 obs. of 110 variables:
## $ CHROM : chr "1" "1" "1" "1" ...
## $ POS : int 3669172 3677933 3680345 3683914 3683957 3683982 3683997 11735245 11736131 11737010 ...
## $ ID : chr "rs41315312" "rs1181883" "rs144028564" "rs2275839" ...
## $ REF : chr "C" "T" "C" "T" ...
## $ ALT : chr "T" "C" "T" "A" ...
## $ QUAL : num 1167626 908688 46177 70073 198 ...
## $ FILTER : chr "PASS" "PASS" "PASS" "PASS" ...
## $ init_AC : chr "116" "601" "8" "9" ...
## $ init_AF : chr "0.080" "0.417" "5.634e-03" "6.259e-03" ...
## $ init_AN : int 1442 1442 1420 1438 1440 1440 1444 1436 1422 1454 ...
## $ AS_FilterStatus : chr "PASS" "PASS" "PASS" "PASS" ...
## $ AS_VQSLOD : chr "6.2410" "11.6456" "8.9794" "9.6262" ...
## $ AS_culprit : chr "SOR" "FS" "FS" "FS" ...
## $ BaseQRankSum : num 0.742 -1.622 0 0.387 -0.12 ...
## $ ClippingRankSum : num -0.108 -0.021 0.12 -0.389 0.168 0.452 0.095 0.087 0.24 0.715 ...
## $ DB : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ DP : int 203695 74466 64575 119262 118118 110518 110860 244440 191835 127339 ...
## $ DS : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ END : int NA NA NA NA NA NA NA NA NA NA ...
## $ ExcessHet : num 1.2 3.64 3.1 3.12 3.01 ...
## $ FS : num 1.7 0 0 0 0 ...
## $ HaplotypeScore : num NA NA NA NA NA NA NA NA NA NA ...
## $ InbreedingCoeff : num 0.0204 -0.0089 -0.0057 -0.0067 0.2668 ...
## $ MLEAC : chr "116" "600" "8" "9" ...
## $ MLEAF : chr "0.080" "0.416" "5.634e-03" "6.259e-03" ...
## $ MQ : num 60 60 60 60 60 60 60 60 60 60 ...
## $ MQRankSum : num -0.051 0.005 -0.465 0.331 -1.271 ...
## $ NDA : int 1 3 1 1 1 1 2 3 1 1 ...
## $ NEGATIVE_TRAIN_SITE : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ POSITIVE_TRAIN_SITE : logi TRUE TRUE TRUE TRUE TRUE FALSE ...
## $ QD : num 16.45 14.48 11.19 16.22 7.09 ...
## $ ReadPosRankSum : num 0.621 0.397 0.376 0.367 0.216 3.11 0.664 0.849 -0.495 0.693 ...
## $ SOR : num 1.173 0.667 0.702 0.252 0.77 ...
## $ SplitVarID : chr "Var000000003" "Var000000048" "Var000000073" "Var000000095" ...
## $ VQSLOD : num NA NA NA NA NA NA NA NA NA NA ...
## $ culprit : chr NA NA NA NA ...
## $ exac_non_TCGA.AC : chr "7859" "44134" "684" "2082" ...
## $ exac_non_TCGA.AC_AFR : chr "496" "5759" "12" "11" ...
## $ exac_non_TCGA.AC_AMR : chr "428" "5474" "64" "692" ...
## $ exac_non_TCGA.AC_Adj : chr "7856" "44112" "684" "2075" ...
## $ exac_non_TCGA.AC_EAS : chr "428" "2227" "0" "940" ...
## $ exac_non_TCGA.AC_FEMALE: chr "2983" "19316" "306" "1098" ...
## $ exac_non_TCGA.AC_FIN : chr "563" "2545" "72" "105" ...
## $ exac_non_TCGA.AC_Hemi : chr NA NA NA NA ...
## $ exac_non_TCGA.AC_Het : chr "7192" "24906" "672" "1901" ...
## $ exac_non_TCGA.AC_Hom : chr "332" "9603" "6" "87" ...
## $ exac_non_TCGA.AC_MALE : chr "4873" "24796" "378" "977" ...
## $ exac_non_TCGA.AC_NFE : chr "3675" "20768" "520" "93" ...
## $ exac_non_TCGA.AC_OTH : chr "56" "280" "4" "11" ...
## $ exac_non_TCGA.AC_SAS : chr "2210" "7059" "12" "223" ...
## $ exac_non_TCGA.AF : chr "0.074" "0.416" "6.440e-03" "0.020" ...
## $ exac_non_TCGA.AN : int 106210 106210 106210 106210 106210 106210 106200 NA 106210 106206 ...
## $ exac_non_TCGA.AN_AFR : int 9058 9060 9060 9014 8922 8536 7914 NA 8902 8896 ...
## $ exac_non_TCGA.AN_AMR : int 11216 11194 11194 11148 11104 10784 9992 NA 11070 11114 ...
## $ exac_non_TCGA.AN_Adj : int 106196 106120 106140 105806 105114 101574 94348 NA 105034 105596 ...
## $ exac_non_TCGA.AN_EAS : int 7862 7842 7852 7842 7848 7722 7254 NA 7834 7852 ...
## $ exac_non_TCGA.AN_FEMALE: chr "45870" "45836" "45832" "45656" ...
## $ exac_non_TCGA.AN_FIN : int 6614 6606 6612 6572 6414 5988 5426 NA 6450 6594 ...
## $ exac_non_TCGA.AN_MALE : chr "60326" "60284" "60308" "60150" ...
## $ exac_non_TCGA.AN_NFE : int 54344 54316 54330 54182 53942 52516 49330 NA 53998 54144 ...
## $ exac_non_TCGA.AN_OTH : int 694 694 694 686 672 642 590 NA 676 690 ...
## $ exac_non_TCGA.AN_SAS : int 16408 16408 16398 16362 16212 15386 13842 NA 16104 16306 ...
## $ exac_non_TCGA.Hemi_AFR : chr NA NA NA NA ...
## $ exac_non_TCGA.Hemi_AMR : chr NA NA NA NA ...
## $ exac_non_TCGA.Hemi_EAS : chr NA NA NA NA ...
## $ exac_non_TCGA.Hemi_FIN : chr NA NA NA NA ...
## $ exac_non_TCGA.Hemi_NFE : chr NA NA NA NA ...
## $ exac_non_TCGA.Hemi_OTH : chr NA NA NA NA ...
## $ exac_non_TCGA.Hemi_SAS : chr NA NA NA NA ...
## $ exac_non_TCGA.Het_AFR : chr "476" "2111" "12" "11" ...
## $ exac_non_TCGA.Het_AMR : chr "412" "2746" "64" "652" ...
## $ exac_non_TCGA.Het_EAS : chr "412" "1565" "0" "814" ...
## $ exac_non_TCGA.Het_FIN : chr "517" "1565" "72" "105" ...
## $ exac_non_TCGA.Het_NFE : chr "3401" "12746" "510" "89" ...
## $ exac_non_TCGA.Het_OTH : chr "48" "166" "4" "11" ...
## $ exac_non_TCGA.Het_SAS : chr "1926" "4007" "10" "219" ...
## $ exac_non_TCGA.Hom_AFR : chr "10" "1824" "0" "0" ...
## $ exac_non_TCGA.Hom_AMR : chr "8" "1364" "0" "20" ...
## $ exac_non_TCGA.Hom_EAS : chr "8" "331" "0" "63" ...
## $ exac_non_TCGA.Hom_FIN : chr "23" "490" "0" "0" ...
## $ exac_non_TCGA.Hom_NFE : chr "137" "4011" "5" "2" ...
## $ exac_non_TCGA.Hom_OTH : chr "4" "57" "0" "0" ...
## $ exac_non_TCGA.Hom_SAS : chr "142" "1526" "1" "2" ...
## $ kgen.AC : chr "390" "2384" "17" "137" ...
## $ kgen.AF : chr "0.0778754" "0.476038" "0.00339457" "0.0273562" ...
## $ kgen.AFR_AF : chr "0.0461" "0.6944" "0.0015" "0.003" ...
## $ kgen.AMR_AF : chr "0.0418" "0.4683" "0.0086" "0.0346" ...
## $ kgen.AN : int 5008 5008 5008 5008 5008 NA 5008 5008 5008 NA ...
## $ kgen.EAS_AF : chr "0.0645" "0.2788" "0" "0.0883" ...
## $ kgen.EUR_AF : chr "0.0875" "0.4354" "0.0089" "0.003" ...
## $ kgen.SAS_AF : chr "0.1503" "0.4315" "0" "0.0174" ...
## $ SYMBOL : chr "CCDC27" "CCDC27" "CCDC27" "CCDC27" ...
## $ Allele : chr "T" "C" "T" "A" ...
## $ Existing_variation : chr "rs41315312" "rs1181883" "rs144028564" "rs2275839" ...
## $ Consequence : chr "missense_variant" "missense_variant" "missense_variant" "missense_variant" ...
## $ IMPACT : chr "MODERATE" "MODERATE" "MODERATE" "MODERATE" ...
## $ CLIN_SIG : chr NA NA NA NA ...
## $ cDNA_position : chr "211" "884" "1481" "1732" ...
## $ CDS_position : chr "127" "800" "1397" "1648" ...
## [list output truncated]
fixed.df[1:5,1:5]
## CHROM POS ID REF ALT
## Var000000003 1 3669172 rs41315312 C T
## Var000000048 1 3677933 rs1181883 T C
## Var000000073 1 3680345 rs144028564 C T
## Var000000095 1 3683914 rs2275839 T A
## Var000000096 1 3683957 rs146757641 T C
# Check consistence of rownames
sum(rownames(gt.mx) != rownames(gt_chr.mx))
## [1] 0
sum(rownames(gt.mx) != rownames(dp.mx))
## [1] 0
sum(colnames(gt.mx) != colnames(gt_chr.mx))
## [1] 0
sum(colnames(gt.mx) != colnames(dp.mx))
## [1] 0
sum(rownames(gt.mx) != rownames(fixed.df))
## [1] 0
save.image(paste(base_folder, "r02_filter_genotypes.RData", sep="/"))
ls()
## [1] "base_folder" "dp.mx" "fixed.df" "gt_chr.mx" "gt.mx"
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] knitr_1.21
##
## loaded via a namespace (and not attached):
## [1] compiler_3.5.1 magrittr_1.5 tools_3.5.1 htmltools_0.3.6 yaml_2.2.0 Rcpp_1.0.0 stringi_1.2.4 rmarkdown_1.11 stringr_1.3.1 xfun_0.4 digest_0.6.18 evaluate_0.12
Sys.time()
## [1] "2019-04-23 15:01:12 BST"